Data

# Loading email data set: consists of "text data" and "dummy variable"
email_df <- data.table::fread( "spam_or_not_spam.csv", header = T )
# Randomizing data
set.seed(46234)
email_df <- email_df[ sample( 1:nrow( email_df ) ),  ]
# Factoring dummy variable
email_df$label <- factor( email_df$label )

Packages Used

library(pacman)
p_load(fastverse, magrittr, here, skimr, dplyr, 
       ggplot2, ggthemes, equatiomatic, gridExtra, 
       caret, naivebayes, knitr, kableExtra, shiny, 
       data.table, tidymodels, ggthemes, wordcloud,
       tm, SnowballC, RColorBrewer, e1071, data.table, 
       stringr, disk.frame, knitr, ranger, DT
      )

Preprocessing & Cleaning (stemming)

To retrieve the root of a word (eg, doing -> do), options are ā€œstemmingā€ &ā€œlemmatizationā€.

STEMMING: faster but maybe not as effective
LEMMATIZATION: slower but more effective
More on this: Here

VectorSource(): creates one document for each email
Vcorpus(): creates a volatile corpus from these individual emails

email_corpus <- VCorpus(
                  VectorSource(
                      email_df$email
                      )
                  )

# Using `tm` package to stem email content
email_corpus <- tm::tm_map( email_corpus, 
                            tm::stemDocument )

# Removing puctuations
email_corpus = tm_map( email_corpus, 
                       removePunctuation )

# Removing stopwords
email_corpus <- tm_map( email_corpus, 
                       removeWords, stopwords( "en" ) )

# OPTIONAL -- Removing two most frequent stopwords: "NUMBER", "URL"
 email_corpus <- tm_map( email_corpus,
 removeWords, c("NUMBER", "number", "url", "URL") )
 
# Testing keeping URL
 # email_corpus <- tm_map( email_corpus,
                        # removeWords, c("NUMBER", "number") )
 
# OPTIONAL -- Removing extra white space
# email_corpus <- tm_map( email_corpus,
#                         stringr::str_squish)
 
# DocumentTermMatrix(): tokenize the email corpus.
email_dtm <- tm::DocumentTermMatrix( sample( email_corpus, length( email_corpus ) ) )

Preprocessed Data for Visualization

reverse_email <- data.frame(
                text = sapply( email_corpus, as.character ), 
                stringsAsFactors = FALSE, 
                type = email_df$label
                )

Most Frequent Words in Data

Visualing text data after cleaning and pre-processing

wordcloud( reverse_email$text, 
           max.words = 150, 
           colors = brewer.pal( 7, "Dark2" ), 
           random.order = FALSE ) 

Subsetting spam vs.Ā non-spam datasets

Visualizing most frequent words in spam data

# Subsetting to spam == 1
spam <- reverse_email %>% filter( type == 1 )
wordcloud( spam$text, 
           max.words = 150, 
           colors = brewer.pal( 7, "Dark2" ), 
           random.order = FALSE,
           main = "Spam")

Visualing most frequent words in non-spam data

# Subsetting to spam == 0
ham <- reverse_email %>% filter( type == 0 )
wordcloud( ham$text, 
           max.words = 150, 
           colors = brewer.pal( 7, "Dark2" ), 
           random.order = FALSE,
           main = "Non-Spam") 

Splitting Data

Data is sorted randomly, so it’s easy to split Testing vs.Ā Training

email_dtm_train <- email_dtm[   1:2400, ]         # 80% training
email_dtm_test  <- email_dtm[2401:3000, ]         # 20% testing

# Adding labels for convenience
email_train_label <- email_df[   1:2400, ]$label
email_test_label  <- email_df[2401:3000, ]$label

Checking that data is split proportionally

prop_table = data.frame(c(prop.table( table( email_train_label ) )[[2]], #Train
                          prop.table( table( email_train_label ) )[[1]]),
                        c(prop.table( table( email_test_label ) )[[2]], # Test
                          prop.table( table( email_test_label ) )[[1]])
                        )

rownames(prop_table) = c("Spam", "Non-Spam")
names(prop_table) = c("Train", "Test")
kable(prop_table, digits = 3) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Train Test
Spam 0.17 0.152
Non-Spam 0.83 0.848

Trimming: reducing number of features

We currently have 25,050 variables (too many!)
Defining threshold (eg. 1 == 1%) Goal: Eliminate words that appear in __% of records in the training data

min_freq <- round( email_dtm$nrow * ( ( threshold = 10.0 ) / 100 ),     # using 10%
               0 
              )

Create vector of most frequent words

freq_words <- findFreqTerms( x = email_dtm, 
                             lowfreq = min_freq )

# Filter the DTM
email_dtm_freq_train <- email_dtm_train[ , freq_words]
email_dtm_freq_test  <- email_dtm_test[ , freq_words]

The list of our most frequent words

# I know there's probably a quicker way todo this but hey it was only 14 lines
freq_words_plus_1 =  c(freq_words, "")
freq_words_dt = data.frame(
  c(freq_words_plus_1[seq( 1+0*1,16*1)]),
  c(freq_words_plus_1[seq(1*16+1,16*2)]),
  c(freq_words_plus_1[seq(2*16+1,16*3)]),
  c(freq_words_plus_1[seq(3*16+1,16*4)]),
  c(freq_words_plus_1[seq(4*16+1,16*5)]),
  c(freq_words_plus_1[seq(5*16+1,16*6)]),
  c(freq_words_plus_1[seq(6*16+1,16*7)]),
  c(freq_words_plus_1[seq(7*16+1,16*8)]),
  c(freq_words_plus_1[seq(8*16+1,16*9)]),
  c(freq_words_plus_1[seq(9*16+1,16*10)]),
  c(freq_words_plus_1[seq(10*16+1,16*11)]),
  c(freq_words_plus_1[seq(11*16+1,16*12)]),
  c(freq_words_plus_1[seq(12*16+1,16*13)])
)

# datatable(freq_words_dt, rownames = FALSE, colnames = FALSE, filter="none", options = list(pageLength = 13, scrollX=T) )

kable(freq_words_dt, digits = 3, col.names = NULL) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>% 
  column_spec (1:13,border_left = T, border_right = T) %>% 
  row_spec(1:16, extra_css = "border-bottom: 1px solid")
actual can differ found instal long much onli problem right site test want
address case doe free interest look must order process rpm softwar thank way
also chang doesn get internet lot name origin product run someth thing web
american check don give invest made nation packag program said spam think week
ani click email good issu mail need part provid say spamassassin time well
anoth code end got just make net peopl put secur sponsor today whi
anyon com error govern keep manag network perl rate see start trade will
back come even great know mani never person razor seem state tri window
base compani everi group last market new phone read send still two within
becaus comput exmh help life may next place real sent subject type without
befor countri file high like mean now pleas realli sep support unit work
best current find home line messag numbertnumber point receiv server sure unsubscrib world
better data first hyperlink link might offer post releas servic system use write
build date follow idea linux million old power remov set take user wrote
busi day fork includ list money onc price report show talk veri year
call develop form inform live month one probabl requir sinc technolog version

Final, cleaned, prepared dataset

# Simple dummy transformation fn.
convert_values <- function(x){
                    x = ifelse( x > 0, "Yes", "No" )
}

# Declaring final `train` and `test` datasets
email_train <- apply( email_dtm_freq_train, MARGIN = 2,
                      convert_values )
email_test  <- apply( email_dtm_freq_test, MARGIN = 2,
                      convert_values )

Preview data

library(DT)
datatable(email_train, rownames = FALSE, filter="none", options = list(pageLength = 5, scrollX=T) )

Methods

# I got all of this code from stackoverflow by doing some simple googling for how to display a confusion matrix
## Source https://stackoverflow.com/questions/23891140/r-how-to-visualize-confusion-matrix-using-the-caret-package

draw_confusion_matrix <- function(cm) {
  total <- sum(cm$table)
  res <- as.numeric(cm$table)
  # Generate color gradients. Palettes come from RColorBrewer.
  greenPalette <- c("#F7FCF5","#E5F5E0","#C7E9C0","#A1D99B","#74C476","#41AB5D","#238B45","#006D2C","#00441B")
  redPalette <- c("#FFF5F0","#FEE0D2","#FCBBA1","#FC9272","#FB6A4A","#EF3B2C","#CB181D","#A50F15","#67000D")
  getColor <- function (greenOrRed = "green", amount = 0) {
    if (amount == 0)
      return("#FFFFFF")
    palette <- greenPalette
    if (greenOrRed == "red")
      palette <- redPalette
    colorRampPalette(palette)(100)[10 + ceiling(90 * amount / total)]
  }
  # set the basic layout
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('Confusion Matrix', cex.main=2)
  # create the matrix 
  classes = colnames(cm$table)
  rect(150, 430, 240, 370, col=getColor("green", res[1]))
  text(195, 435, "Non-Spam", cex=1.2)
  rect(250, 430, 340, 370, col=getColor("red", res[3]))
  text(295, 435, "Spam", cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  rect(150, 305, 240, 365, col=getColor("red", res[2]))
  rect(250, 305, 340, 365, col=getColor("green", res[4]))
  text(140, 400, "Non-Spam", cex=1.2, srt=90)
  text(140, 335, "Spam", cex=1.2, srt=90)
  # add in the cm results
  text(195, 400, res[1], cex=1.6, font=2, col='black')
  text(195, 335, res[2], cex=1.6, font=2, col='black')
  text(295, 400, res[3], cex=1.6, font=2, col='black')
  text(295, 335, res[4], cex=1.6, font=2, col='black')
  # add in the specifics 
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "Metrics", xaxt='n', yaxt='n')
  text(10, 85, names(cm$byClass[1]), cex=1.2, font=2)
  text(10, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
  text(30, 85, names(cm$byClass[2]), cex=1.2, font=2)
  text(30, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
  text(50, 85, names(cm$byClass[5]), cex=1.2, font=2)
  text(50, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
  text(70, 85, names(cm$byClass[6]), cex=1.2, font=2)
  text(70, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
  text(90, 85, names(cm$byClass[7]), cex=1.2, font=2)
  text(90, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
  # add in the accuracy information 
  text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
  text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
  text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
  text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
} 

Naive Bayes

#Create model from the training dataset
bayes_classifier <- e1071::naiveBayes( email_train, 
                                       email_train_label )

# Predicting on test data
email_test_pred <- predict( bayes_classifier, 
                            email_test )

Naive Bayes Results

# Display results
naive_bayes_results = confusionMatrix( data = email_test_pred, 
                                       reference = email_test_label,
                                       positive = "1", 
                                       dnn = c("Prediction", "Actual") )

draw_confusion_matrix(naive_bayes_results)

Lasso

# Splitting for 5-fold cross-validation
folds <- email_train %>% vfold_cv(v = 5)

# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )

# Defining the recipe
data_recipe <- recipe(
                email_train_label ~ ., 
                data = email_train 
                ) %>% 
              update_role(V1, new_role = 'id variable')  %>% 
              step_dummy(all_nominal(), - all_outcomes())

# Defining Lasso Model
lasso <- linear_reg(
          penalty = tune(), 
          mixture = 1) %>% 
          set_engine("glmnet")

# Setting up workflow
workflow_lasso <- workflow() %>%
                    add_recipe( data_recipe ) %>%
                    add_model( lasso )
# Parallelize 
doParallel::registerDoParallel(cores = 4)

# CV
lasso_cv <- workflow_lasso %>% 
              tune_grid(
                resamples = folds,
                grid = lambdas,
                metrics = metric_set(rmse, mae)
              )

Lasso Results

# Find best models            ( source: juliasilge.com/blog/lasso-the-office/ )
lasso_cv %>% collect_metrics() %>%
            ggplot(aes(penalty, mean, color = .metric)) +
            geom_errorbar(aes(
              ymin = mean - std_err,
              ymax = mean + std_err
            ),
            alpha = 0.5
            ) +
            geom_line(size = 1.5) +
            facet_wrap(~.metric, scales = "free", nrow = 2) +
            theme(legend.position = "none") + theme_base() + 
            # xlim(0, 1000) +
            scale_x_log10()

best_lasso_mae = lasso_cv %>% show_best(metric = "mae") %>% filter(penalty == min(penalty))
best_lasso_rmse = lasso_cv %>% show_best(metric = "rmse") %>% filter(penalty == min(penalty))
best_lasso = rbind(best_lasso_mae, best_lasso_rmse)
names(best_lasso) = c("Penalty", "Metric", "Estimator", "Mean", "n", "Standard Error", ".config")
kable(best_lasso, digits = 3) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Penalty Metric Estimator Mean n Standard Error .config
0.031 mae standard 0.283 5 0.002 Preprocessor1_Model009
0.023 rmse standard 0.376 5 0.004 Preprocessor1_Model007
# Get test accuracy
best_lasso_workflow = workflow_lasso %>%
  finalize_workflow(select_best(lasso_cv, metric = "mae")) %>%
  fit(data = email_train)

best_lasso = best_lasso_workflow %>% extract_fit_parsnip()

## Clean test data
email_test_clean = recipe(
                email_test_label ~ ., 
                data = email_test 
                ) %>% 
              update_role(V1, new_role = 'id variable')  %>% 
              step_dummy(all_nominal(), - all_outcomes()) %>%
              prep() %>% juice()

## Make predictions
lasso_predictions = predict(best_lasso, email_test_clean)
lasso_predictions = lasso_predictions %>% mutate(prediction = ifelse(.pred < 0.5, 0, 1))
email_test_clean$predictions = lasso_predictions$prediction

## Calculate accuracy
email_test_clean = email_test_clean %>% mutate(accurate = ifelse(predictions == email_test_label, 1, 0))
acc = sum(email_test_clean$accurate) / nrow(email_test_clean)
# print(acc)

Logistic Lasso

# Set seed
set.seed(9753)
# converting outcome variable into character vector
email_train <- email_train %>% 
                  mutate(
                    outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
                  ) 
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# Defining the recipe
data_recipe <- recipe(
                  outcome_as_vector ~ ., 
                  data = email_train 
                ) %>% 
                  step_rm(email_train_label) %>% 
                  update_role(V1, new_role = 'id variable') %>%
                  # step_num2factor( email_train_label ) %>% 
                  step_dummy(all_nominal(), - all_outcomes()) %>%
                  step_zv(all_predictors()) %>%
                  step_normalize(all_predictors())
# Defining Lambdas (from Lecture 005)
lambdas <- data.frame( penalty = c( 0, 10^seq( from = 5, to = -2, length = 100 ) ) )
# Defining Lasso Model
log_lasso <- logistic_reg(
            mode = 'classification',
            penalty = tune(), 
            mixture = 1) %>% 
            set_engine("glmnet")
# Setting up workflow
workflow_lasso <- workflow() %>%
                    add_recipe( data_recipe ) %>%
                    add_model( log_lasso )
# Parallelize 
# doParallel::registerDoParallel(cores = 4)
# CV
log_lasso_cv <- workflow_lasso %>% 
                tune_grid(
                  resamples = folds,
                  metrics = metric_set(accuracy, roc_auc, sens, spec, precision),
                  grid = grid_latin_hypercube(penalty(), size = 5),
                  # grid = grid_latin_hypercube(penalty() , mixture(), size = 20),
                  control = control_grid(parallel_over = 'resamples')
                )

Logistic Lasso Results

# Find best models            ( source: juliasilge.com/blog/lasso-the-office/ )
# log_lasso_cv %>% collect_metrics() %>% arrange(mean) %>%
#                 ggplot(aes(penalty, mean, color = .metric)) +
#                 # geom_line(size = 1.5) +
#                 # geom_errorbar(aes(
#                 #   ymin = mean - std_err,
#                 #   ymax = mean + std_err
#                 # ),
#                 # alpha = 0.5
#                 # ) +
#                 geom_smooth(aes(penalty, mean, color = .metric)
#                             , size = 1 
#                             , alpha = 0.5
#                             , method = "lm") +
#                 # facet_wrap(~.metric, scales = "free", nrow = 3) +
#                 # scale_x_log10() +
#                 # scale_y_log10() +
#                 theme(legend.position = "none") + theme_base() #+ xlim(0, 0.1)
log_lasso_cv_results = log_lasso_cv %>% collect_metrics() %>% group_by(.metric) %>% summarize(mean_accuracy = mean(mean, na.rm = T))
log_lasso_cv_results[1] = c("Accuracy", 
                       "Precision", 
                       "Area Under the Curve", 
                       "Sensitivity", 
                       "Specificity")
names(log_lasso_cv_results) = c("Metric", "Mean")
kable(log_lasso_cv_results, digits = 3) %>% 
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Metric Mean
Accuracy 0.809
Precision 0.831
Area Under the Curve 0.508
Sensitivity 0.967
Specificity 0.040

Logistic

# set.seed(9754)
set.seed(9753)
# converting outcome variable into character vector
email_train <- email_train %>% 
                        mutate(
                          outcome_as_vector = ifelse(email_train_label == 1, "Yes", "No")
                        ) 
# Split for 5-fold cross-validation
folds <- vfold_cv(email_train, v = 5)
# small sample of data 
# sample <- email_train[, ]
# Defining the recipe
data_recipe <- recipe(
                  outcome_as_vector ~ ., 
                  data = email_train 
                    ) %>% 
                  step_rm(email_train_label) %>% 
                  update_role(V1, new_role = 'id variable') %>%
                  # step_num2factor( email_train_label ) %>% 
                  step_dummy(all_nominal(), - all_outcomes()) %>%
                  step_zv(all_predictors()) %>%
                  step_normalize(all_predictors())
# Define the model
model_logistic <- logistic_reg(
                    mode = 'classification') %>%    # Simple LogReg bc no penalty
                    set_engine('glm')
# Workflow
workflow_logistic <- workflow() %>% 
                      add_recipe(data_recipe) %>% 
                      add_model(model_logistic)
# Cross-Validation
cv_logistic <- workflow_logistic %>%
                fit_resamples(
                  resamples = folds,
                  metrics = metric_set(accuracy, roc_auc, sens, spec, precision)
                )

Logistic Regression Results

# Visualizing output
log_reg_results = cv_logistic %>% collect_metrics() %>% 
      select(.metric, mean)
log_reg_results[1] = c("Accuracy", 
                       "Precision", 
                       "Area Under the Curve", 
                       "Sensitivity", 
                       "Specificity")
names(log_reg_results) = c("Metric", "Mean")
kable(log_reg_results, digits = 3) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Metric Mean
Accuracy 0.797
Precision 0.832
Area Under the Curve 0.511
Sensitivity 0.947
Specificity 0.071

Random Forests

random_forest <- train(
                  x = email_train_rf,
                  y = email_train_label_rf,
                  method = "ranger", 
                  num.trees = 200,
                  importance = "impurity",
                  trControl = trainControl(method = "cv", 
                                           number = 3,
                                           verboseIter = TRUE
                  )
                )
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set

Random Forests Results

# Checking "variable importances 
top_25words = varImp(random_forest, scale = TRUE)$importance %>% 
                rownames_to_column() %>% 
                arrange(-Overall) %>% 
                top_n(25) 
## Selecting by Overall
# ggplot(data = top_25words) + 
# aes(x = reorder(rowname, Overall), y = Overall) +
# geom_col(stat = "identify") + 
# labs(title = "Most Predictive Words") +
# coord_flip()

ggplot(data = top_25words, 
   aes(x=reorder(rowname, Overall),
       y = Overall)) +
        geom_bar(stat = "identity") +
        theme_base() +
        theme(axis.text.x=element_text(angle=50, hjust=1))+
        xlab("Top 25 Predictive Words(stemed)")+
        ylab("Frequency of Word") +
        labs(title = "Most Predictive Words") +
        theme(plot.title = element_text(hjust = 0.5))

# Redeclaring `email_test`
email_test  <- apply( email_dtm_freq_test, MARGIN = 2,
                      convert_values )
email_test <- cbind( email_test, 
                     email_test_label )

# Fitting to the test data
predictions <- predict(random_forest, email_test)
random_forest_results = confusionMatrix(predictions, email_test_label)
draw_confusion_matrix(random_forest_results)

beepr::beep(sound = 3)

Results

# Creating df with different metrics and results
comparing_acc_table = data.frame(
  
  c(
    "Naive Bayes",
    "Lasso",
    "Logistic Lasso",
    "Logistic",
    "Random Forest"
    ),
  
  c(
    naive_bayes_results[["overall"]][["Accuracy"]],
    acc,
    log_lasso_cv_results$Mean[1],
    log_reg_results$Mean[1],
    random_forest_results[["overall"]][["Accuracy"]]
    ),
  
  c(
    naive_bayes_results[["byClass"]][["Sensitivity"]],
    0,
    log_lasso_cv_results$Mean[4],
    log_reg_results$Mean[4],
    random_forest_results[["byClass"]][["Sensitivity"]]
  ),
  
  c(
    naive_bayes_results[["byClass"]][["Specificity"]],
    1,
    log_lasso_cv_results$Mean[5],
    log_reg_results$Mean[5],
    random_forest_results[["byClass"]][["Specificity"]]
  )
  
)

names(comparing_acc_table) = c("Method", "Accuracy", "Sensitivity", "Specificity")
kable(comparing_acc_table, digits = 3) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
Method Accuracy Sensitivity Specificity
Naive Bayes 0.788 0.044 0.921
Lasso 0.848 0.000 1.000
Logistic Lasso 0.809 0.967 0.040
Logistic 0.797 0.947 0.071
Random Forest 0.847 0.998 0.000